package com.webull.information.center.carwler.common.util.jsoup.prase_en;

import java.io.IOException;
import java.text.ParseException;
import java.util.Date;
import java.util.Optional;

import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

import com.webull.framework.util.UtilDate;
import com.webull.information.center.carwler.common.model.NewsInformation;
import com.webull.information.center.carwler.common.util.jsoup.HtmlBodyPrase;
import com.webull.information.center.common.constants.Constants;

/**
 * Parser for South China Morning Post (scmp.com) article pages.
 * 
 * @author shimingjun
 * @date 2016-08-22 16:57:28
 * @version 1.0
 * @since JDK 1.8
 */
public class Scmp_HtmlPrase implements HtmlBodyPrase {
	protected final Logger logger = LogManager.getLogger(getClass());

	/**
	 * Fills {@code info} from a parsed scmp.com article page, for example
	 * http://www.scmp.com/news/china/article/1105722/briefs-december-15-2012
	 * <ul>
	 * <li>title: inner HTML of the element with id {@code page-title};</li>
	 * <li>sourceName / language: defaulted to {@code "scmp.com"} and
	 * {@link Constants#lang_en} only when currently blank;</li>
	 * <li>pushTime / newsTime: taken from the {@code datetime} attribute of the
	 * first {@code div.node-published};</li>
	 * <li>content: outer HTML of the non-empty {@code <p>} elements under
	 * {@code div.pane-node-body > div.pane-content}, with their {@code class}
	 * attribute removed.</li>
	 * </ul>
	 * Any exception raised while parsing is logged and not propagated, so
	 * {@code info} may be left partially filled.
	 *
	 * @param doc
	 *            parsed article page
	 * @param info
	 *            target object that receives the extracted fields
	 */
	@Override
	public void praseNewsInfo(org.jsoup.nodes.Document doc, NewsInformation info) {
		try {
			// title
			Optional.ofNullable(doc.getElementById("page-title")).map(h1 -> StringUtils.stripToNull(h1.html()))
					.ifPresent(title -> info.setTitle(title));

			// source name and language are only defaulted, never overwritten
			if (StringUtils.isBlank(info.getSourceName())) {
				info.setSourceName("scmp.com");
			}
			if (StringUtils.isBlank(info.getLanguage())) {
				info.setLanguage(Constants.lang_en);
			}

			// publish time, e.g. datetime="2012-12-15T00:00:00+08:00"
			Optional.ofNullable(doc.select("div.node-published").first())
					.map(published -> StringUtils.stripToNull(published.attr("datetime")))
					.ifPresent(pdate -> applyPublishTime(pdate, info));

			// article body
			Optional.ofNullable(doc.select("div.pane-node-body>div.pane-content").first())
					.map(body -> body.getElementsByTag("P")).ifPresent(paragraphs -> {
						// walk backwards so removing an entry does not shift the
						// indices that are still to be visited
						for (int i = paragraphs.size() - 1; i >= 0; i--) {
							Element p = paragraphs.get(i);
							if (!p.hasText()) {
								paragraphs.remove(i);
							} else {
								// only this paragraph; clearing the whole list on
								// every iteration was redundant quadratic work
								p.removeAttr("class");
							}
						}
						info.setContent(StringUtils.stripToNull(paragraphs.outerHtml()));
					});
		} catch (Exception e) {
			logger.warn("Failed to parse scmp.com article page", e);
		}
	}

	/**
	 * Stores the raw {@code datetime} value as push time and, when it can be
	 * parsed, the corresponding {@link Date} as news time.
	 *
	 * @param pdate
	 *            non-null ISO-like timestamp such as
	 *            {@code 2012-12-15T00:00:00+08:00}
	 * @param info
	 *            target object
	 */
	private void applyPublishTime(String pdate, NewsInformation info) {
		info.setPushTime(pdate);
		try {
			String day = StringUtils.stripToEmpty(StringUtils.substringBefore(pdate, "T"));
			String clockAndZone = StringUtils.substringAfter(pdate, "T");
			int signIdx = offsetSignIndex(clockAndZone);
			// keep only HH:mm:ss: cut at the offset sign, or drop a trailing "Z"
			String time = signIdx >= 0 ? clockAndZone.substring(0, signIdx)
					: StringUtils.removeEnd(clockAndZone, "Z");
			// NOTE(review): the second argument is presumably a whole-hour UTC
			// offset; confirm how UtilDate.parse treats negative values.
			Date d = UtilDate.parse(day + " " + time, Integer.valueOf(offsetHours(clockAndZone, signIdx)),
					"yyyy-MM-dd HH:mm:ss");
			if (d != null) {
				info.setNewsTime(d);
			}
		} catch (Exception e) {
			// best effort: push time is already stored, only news time is skipped
			logger.warn("Unparseable publish time: " + pdate, e);
		}
	}

	/**
	 * Returns the index of the UTC-offset sign inside the part of the timestamp
	 * that follows {@code T}, or a negative value when there is none.
	 */
	private static int offsetSignIndex(String clockAndZone) {
		int plus = clockAndZone.indexOf('+');
		return plus >= 0 ? plus : clockAndZone.indexOf('-');
	}

	/**
	 * Extracts the signed hour component of the UTC offset ({@code +08:00} -> 8,
	 * {@code +10:00} -> 10, {@code -05:00} -> -5). The minute component is
	 * discarded; a missing or non-numeric offset yields 0.
	 *
	 * @param clockAndZone
	 *            text after {@code T}, e.g. {@code 00:00:00+08:00}
	 * @param signIdx
	 *            value returned by {@link #offsetSignIndex(String)}
	 */
	private static int offsetHours(String clockAndZone, int signIdx) {
		if (signIdx < 0) {
			return 0;
		}
		String digits = StringUtils.substringBefore(clockAndZone.substring(signIdx + 1), ":");
		// left(.., 2) also copes with the colon-less "+0800" form
		int hours = NumberUtils.toInt(StringUtils.left(digits, 2), 0);
		return clockAndZone.charAt(signIdx) == '-' ? -hours : hours;
	}

	/**
	 * Manual smoke test: downloads one live article through a proxy at
	 * 127.0.0.1:1080, runs the parser on it and prints the result.
	 */
	public static void main(String[] args) throws ParseException, IOException {
		String url2 = "http://www.scmp.com/news/china/article/1105722/briefs-december-15-2012";
		// url2 =
		// "http://scmp.com/news/hong-kong/education-community/article/1942196/home-working-mums-earn-little-dignity";
		// url2 =
		// "http://www.scmp.com/news/china/article/1930006/watch-incredible-video-fearless-rooftoppers-scaling-crane-opposite-chinas";
		Connection connection = Jsoup.connect(url2).userAgent(
				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
				// .header("x-client-data",
				// "CIq2yQEIpbbJAQjEtskBCP2VygEIwpjKAQjwnMoB")
				.header("x-client-data", RandomStringUtils.randomAlphanumeric(40));

		connection.proxy("127.0.0.1", 1080);
		org.jsoup.nodes.Document doc = connection.timeout(10000).get();
		NewsInformation info = new NewsInformation();
		new Scmp_HtmlPrase().praseNewsInfo(doc, info);
		System.out.println(info);

		// Mon Mar 17, 2014 9:46pm EDT
		// Wed Jan 15 00:00:00 CST 2014
		// Date d = UtilDate.parse("Mon Mar 17, 2014 9:46pm", Locale.US,
		// TimeZone.getTimeZone("EDT"), "E MMM dd, yyyy hh:mma");
		// System.out.println(d);

	}
}
